R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

install packages

#install.packages("data.table")
#install.packages("tm")
#update.packages("tm",  checkBuilt = TRUE)
#install.packages("SnowballC")
#install.packages("rsconnect")
#install.packages("dplyr")
#install.packages("tidytext")
#install.packages("mldr")
#install.packages("Hmisc")
#install.packages("ggplot2")
#install.packages("wordcloud")
#install.packages("RColorBrewer")
#install.packages("stringr")
#install.packages("xgboost")
#install.packages("DT")
#install.packages("dplyr")
#install.packages("caret")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DT)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(stringr)
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
library(ggplot2)
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(mldr)
## 
## Attaching package: 'mldr'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library("data.table")
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library("tm")
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(SnowballC)
library(rsconnect)
## Warning: package 'rsconnect' was built under R version 3.4.4
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.4
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute

importing of training and test data

setwd("/Users/teenathampan/CapstoneProject/Project/")
traindata = fread("train.csv", header = "auto", sep="auto", nrows=-1L,blank.lines.skip=TRUE, encoding="UTF-8")
testdata = fread("test.csv", header = "auto", sep="auto", nrows=-1L,blank.lines.skip=TRUE, encoding="UTF-8")

# marking data as train and test data as well as creating dummy class columns in order to clean comments at the same time

traindata$type <- "train"

testdata$toxic<- 0
testdata$severe_toxic<- 0
testdata$obscene<- 0
testdata$threat<- 0
testdata$insult<- 0
testdata$identity_hate<- 0
testdata$type<- "test"

dataset <- rbind(traindata, testdata)

str(dataset)
## Classes 'data.table' and 'data.frame':   312735 obs. of  9 variables:
##  $ id           : chr  "0000997932d777bf" "000103f0d9cfb60f" "000113f07ec002fd" "0001b41b1c6bb37e" ...
##  $ comment_text : chr  "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms"| __truncated__ "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)" "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information"| __truncated__ "\"\"\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be l"| __truncated__ ...
##  $ toxic        : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ severe_toxic : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ obscene      : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ threat       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ insult       : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ identity_hate: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ type         : chr  "train" "train" "train" "train" ...
##  - attr(*, ".internal.selfref")=<externalptr>
# function to clean comment field

clean_text<- function(text){
  
  # text to lower case
  text <- tolower(text)
  
  # remove linebreaks
  text<- gsub("\n", " ", text)
  
  # remove extra white spaces to one space
  text<- gsub("\\s+", " ", text)
  
  # transform short forms
  text<- gsub("'ll", " will", text)
  text<- gsub("i'm", "i am", text)
  text<- gsub("'re", " are", text)
  text<- gsub("'s", " is", text)
  text<- gsub("'ve", " have", text)
  text<- gsub("'d", " would", text)
  text<- gsub("can't", "can not", text)
  text<- gsub("don't", "do not", text)
  text<- gsub("doesn't", "does not", text)
  text<- gsub("isn't", "is not", text)
  text<- gsub("aren't", "are not", text)
  text<- gsub("couldn't", "could not", text)
  text<- gsub("mustn't", "must not", text)
  text<- gsub("didn't", "did not", text)
  text<- gsub("weren't", "were not", text)
  
  # remove incorrect text
  text<- gsub("f+u+c+k+\\b", "fuck", text)
  
  # remove graphics
  text<- gsub("[^[:graph:]]", " ", text)
  # remove punctuation
  text<- gsub("[[:punct:]]", " ", text)
  # remove digits
  text<- gsub("[[:digit:]]", " ", text)
  
  # strip multiple whitspace to one
  text<- gsub("\\s+", " ", text)
  
  # remove "shittext"
  text <- gsub("\\b(a|e)w+\\b", "AWWWW", text)
  text <- gsub("\\b(y)a+\\b", "YAAAA", text)
  text <- gsub("\\b(w)w+\\b", "WWWWW", text)
  text <- gsub("\\b((l+)(a+))+\\b", "LALALA", text)
  text <- gsub("(w+)(o+)(h+)(o+)", "WOHOO", text)
  text <- gsub("\\b(d?(u+)(n+)?(h+))\\b", "UUUHHH", text)
  text <- gsub("\\b(a+)(r+)(g+)(h+)\\b", "ARGH", text)
  text <- gsub("\\b(a+)(w+)(h+)\\b", "AAAWWHH", text)
  text <- gsub("\\b(p+)(s+)(h+)\\b", "SHHHHH", text)
  text <- gsub("\\b((s+)(e+)?(h+))+\\b", "SHHHHH", text)
  text <- gsub("\\b(s+)(o+)\\b", "", text)
  text <- gsub("\\b(h+)(m+)\\b", "HHMM", text)
  text <- gsub("\\b((b+)(l+)(a+)(h+)?)+\\b", "BLABLA", text)
  text <- gsub("\\b((y+)(e+)(a+)(h+)?)+\\b", "YEAH", text)
  text <- gsub("\\b((z+)?(o+)(m+)(f+)?(g+))+\\b", "OMG", text)
  text <- gsub("aa(a+)", "a", text)
  text <- gsub("ee(e+)", "e", text)
  text <- gsub("i(i+)", "i", text)
  text <- gsub("oo(o+)", "o", text)
  text <- gsub("uu(u+)", "u", text)
  text <- gsub("\\b(u(u+))\\b", "u", text)
  text <- gsub("y(y+)", "y", text)
  text <- gsub("hh(h+)", "h", text)
  text <- gsub("gg(g+)", "g", text)
  text <- gsub("tt(t+)\\b", "t", text)
  text <- gsub("(tt(t+))", "tt", text)
  text <- gsub("mm(m+)", "m", text)
  text <- gsub("ff(f+)", "f", text)
  text <- gsub("cc(c+)", "c", text)
  text <- gsub("\\b(kkk)\\b", "KKK", text)
  text <- gsub("\\b(pkk)\\b", "PKK", text)
  text <- gsub("kk(k+)", "kk", text)
  text <- gsub("fukk", "fuck", text)
  text <- gsub("k(k+)\\b", "k", text)
  text <- gsub("f+u+c+k+\\b", "fuck", text)
  text <- gsub("((a+)|(h+)){3,}", "HAHEHI", text)
  text <- gsub("mothjer", "mother", text)
  text <- gsub("wikipedia ", " ", text)
  text <- gsub("wiki ", " ", text)
  
  #remove non ascii words
  text <- gsub("[^\x20-\x7e]+", " ", text)
  
  # remove stopwords
otherstopwords<- c("can", "will", "don", "now", "just", "also", "may", "get", "well", "need", "say", "way", "want", "see", "read", "look", "stop", "like", "really", "however", "let", "ask", "used", "made", "much", "utc", "added", "didn", "sure", "put", "better", "using", "tell", "anything", "one", "two", "wiki", "wikipedia", "first", "second", "however", "hahehi", "peopl", "talk", "page", "edit", "articl", "user", "make", "put", "far", "bit", "well", "still", "much", "one", "two", "don", "now", "even", "article", "articles", "edit", "edits", "page", "pages","talk", "editor", "ax", "edu", "subject", "lines", "like", "likes", "line","uh", "oh", "also", "get", "just", "hi", "hello", "ok", "ja", "editing", "edited","dont", "wikipedia", "hey", "however", "id", "yeah", "yo", "use", "need", "take", "give", "say", "user", "day", "want", "tell", "even", "look", "one", "make", "come", "see", "said", "now", "know", "talk", "read", "time", "sentence", "ain't", "wow", "image", "jpg", "copyright","wikiproject", "background color", "align", "px", "pixel",
                      "org", "com", "en", "ip", "ip address", "http", "www", "html", "htm",
                      "wikimedia", "https", "httpimg", "url", "urls", "utc", "uhm",
                      "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
                      "you", "your", "yours", "yourself", "yourselves", 
                      "he", "him", "his", "himself", 
                      "she", "her", "hers", "herself", 
                      "it", "its", "itself",    
                      "they", "them", "their", "theirs", "themselves",
                      "i'm", "you're", "he's", "i've", "you've", "we've", "we're",
                      "she's", "it's", "they're", "they've", 
                      "i'd", "you'd", "he'd", "she'd", "we'd", "they'd", 
                      "i'll", "you'll", "he'll", "she'll", "we'll", "they'll",
                      "what", "which", "who", "whom", "this", "that", "these", "those",
                      "am", "can", "will", "not",
                      "is", "was", "were", "have", "has", "had", "having", "wasn't", "weren't", "hasn't",
                      "are", "cannot", "isn't", "aren't", "doesn't", "don't", "can't", "couldn't", "mustn't", "didn't",    
                      "haven't", "hadn't", "won't", "wouldn't",  
                      "do", "does", "did", "doing", "would", "should", "could",  
                      "be", "been", "being", "ought", "shan't", "shouldn't", "let's", "that's", "who's", "what's", "here's",
                      "there's", "when's", "where's", "why's", "how's", "a", "an", "the", "and", "but", "if",
                      "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against",
                      "between", "into", "through", "during", "before", "after", "above", "below", "to", "from",
                      "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once",
                      "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
                      "most", "other", "some", "such", "no", "nor", "only", "own", "same", "so", "than",
                      "too", "very")
text <- removeWords(text, stopwords("en"))
text <- removeWords(text, otherstopwords)
  
   return(unname(text))
}

cleaned dataset

dataset$cleanedtext <- clean_text(dataset$comment_text)

# separate out training and test data

train <-subset(dataset, type=="train",select=c(-comment_text, -type))
test<-subset(dataset, type=="test",select=c(id, cleanedtext))

exploratory analysis - toxic, severe_toxic, obscene, threat, insult, identity_hate / toxic, obscene, insult are thee classes that go together

summary(train)
##       id                toxic          severe_toxic         obscene       
##  Length:159571      Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
##  Class :character   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000  
##  Mode  :character   Median :0.00000   Median :0.000000   Median :0.00000  
##                     Mean   :0.09584   Mean   :0.009996   Mean   :0.05295  
##                     3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000  
##                     Max.   :1.00000   Max.   :1.000000   Max.   :1.00000  
##      threat             insult        identity_hate     
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.00000   Median :0.000000  
##  Mean   :0.002995   Mean   :0.04936   Mean   :0.008805  
##  3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.000000   Max.   :1.00000   Max.   :1.000000  
##  cleanedtext       
##  Length:159571     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
str(train)
## Classes 'data.table' and 'data.frame':   159571 obs. of  8 variables:
##  $ id           : chr  "0000997932d777bf" "000103f0d9cfb60f" "000113f07ec002fd" "0001b41b1c6bb37e" ...
##  $ toxic        : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ severe_toxic : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ obscene      : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ threat       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ insult       : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ identity_hate: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ cleanedtext  : chr  "explanation       username hardcore metallica fan  reverted    vandalisms  closure   gas   voted  new york doll"| __truncated__ "d AWWWW  matches  background colour   seemingly stuck  thanks  january  " " man     trying   war      guy  constantly removing relevant information  talking     instead      seems  care "| __truncated__ "       real suggestions  improvement  wondered   section statistics   later    subsection  types  accidents  th"| __truncated__ ...
##  - attr(*, ".internal.selfref")=<externalptr>
dim(train)
## [1] 159571      8
colSums(sapply(train, is.na))
##            id         toxic  severe_toxic       obscene        threat 
##             0             0             0             0             0 
##        insult identity_hate   cleanedtext 
##             0             0             0
# plotting number of documents with each toxicity category
train2<-train[,c(2:7)]
x<-barplot(colSums(train2), ylim = c(0,20000), xlab="type of toxicity", ylab="frequency", main ="Frequency of each toxicity level")
y<-as.matrix(colSums(train2))
text(x, y, labels=as.character(y), pos = 3, cex=1)

train$toxcount<-rowSums(train[,2:7])

# plotting number of documents with the multiple labels per document
Num_Class<-table(train$toxcount)
tox_class<-as.data.frame(Num_Class, row.names = NULL, responseName = "Num_Doc", sep=" ")
colnames(tox_class)[colnames(tox_class)=="Var1"] <- "Num_of_classes"
c1<-ggplot(data = subset(tox_class, Num_of_classes!=0), aes(x=Num_of_classes, y=Num_Doc)) + geom_bar(stat="identity")+ggtitle("Document frequency for multi labels")+geom_text(aes(label = Num_Doc), vjust = 1.5, color = "red")
ggsave("Document frequency for multi labels.png", width=297, height =210, units = "mm")

TDM, words and wordcloud for label toxic test

# load the data as a corpus
train_to <- train[toxic==1]$cleanedtext
train_to <- Corpus(VectorSource(train_to))  %>%
  tm_map(stemDocument)

## build  a term document matrix
tdm_to<-TermDocumentMatrix(train_to)
m_to<-as.matrix(tdm_to)
v_to<-sort(rowSums(m_to), decreasing=TRUE)
d_to<-data.frame(word = names(v_to), freq=v_to)

## generate wordcloud with layout to include title
set.seed(1234)
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0, 4))
plot.new()
text(x=.5, y=.1, "Toxic wordcloud")
wordcloud(words=d_to$word, freq=d_to$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

TDM, words and wordcloud for label severe_toxic test

# load the data as a corpus
train_st <- train[severe_toxic==1]$cleanedtext
train_st <- Corpus(VectorSource(train_st))  %>%
  tm_map(stemDocument)

## build  a term document matrix
tdm_st<-TermDocumentMatrix(train_st)
m_st<-as.matrix(tdm_st)
v_st<-sort(rowSums(m_st), decreasing=TRUE)
d_st<-data.frame(word = names(v_st), freq=v_st)

## generate wordcloud with layout to include title

set.seed(1234)
layout(matrix(c(1, 2), nrow=2), heights=c(1, 4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=.1, "Severe Toxic wordcloud")
wordcloud(words=d_st$word, freq=d_st$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

TDM, words and wordcloud for label obscene test

# load the data as a corpus
train_o <- train[obscene==1]$cleanedtext
train_o <- Corpus(VectorSource(train_o))  %>%
  tm_map(stemDocument)

## build  a term document matrix
tdm_o<-TermDocumentMatrix(train_o)
m_o<-as.matrix(tdm_o)
v_o<-sort(rowSums(m_o), decreasing=TRUE)
d_o<-data.frame(word = names(v_o), freq=v_o)

## generate wordcloud
set.seed(1234)
layout(matrix(c(1,2), nrow=2), heights=c(1,4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=.1, "Obscene wordcloud")
wordcloud(words=d_o$word, freq=d_o$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

TDM, words and wordcloud for label threat test

# load the data as a corpus
train_t <- train[threat==1]$cleanedtext
train_t <- Corpus(VectorSource(train_t))  %>%
  tm_map(stemDocument)

## build  a term document matrix
tdm_t<-TermDocumentMatrix(train_t)
m_t<-as.matrix(tdm_t)
v_t<-sort(rowSums(m_t), decreasing=TRUE)
d_t<-data.frame(word = names(v_t), freq=v_t)

## generate wordcloud
set.seed(1234)
layout(matrix(c(1,2), nrow=2), heights=c(1,4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=0.1, "Threat wordcloud")
wordcloud(words=d_t$word, freq=d_t$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

TDM, words and wordcloud for label insult test

# load the data as a corpus
train_i <- train[insult==1]$cleanedtext
train_i <- Corpus(VectorSource(train_i))  %>%
  tm_map(stemDocument)

## build  a term document matrix
tdm_i<-TermDocumentMatrix(train_i)
m_i<-as.matrix(tdm_i)
v_i<-sort(rowSums(m_i), decreasing=TRUE)
d_i<-data.frame(word = names(v_i), freq=v_i)

## generate wordcloud
set.seed(1234)
layout(matrix(c(1,2), nrow=2), heights=c(1,4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=.5, "Insult wordcloud")
wordcloud(words=d_i$word, freq=d_i$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

TDM, words and wordcloud for label identity_hate test

traindata_ih<-subset(traindata, identity_hate==1)
# load the data as a corpus
train_ih <- train[identity_hate==1]$cleanedtext
train_ih <- Corpus(VectorSource(train_ih))  %>%
  tm_map(stemDocument)

## build  a term document matrix
tdm_ih<-TermDocumentMatrix(train_ih)
m_ih<-as.matrix(tdm_ih)
v_ih<-sort(rowSums(m_ih), decreasing=TRUE)
d_ih<-data.frame(word = names(v_ih), freq=v_ih)

## generate wordcloud
set.seed(1234)
layout(matrix(c(1,2), nrow=2), heights=c(1,4))
par(mar=rep(0,4))
plot.new()
text(x=.5, y=.5, "Identity Hate wordcloud")
wordcloud(words=d_ih$word, freq=d_ih$freq, min.freq=1, max.words = 200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

Tokenisation of the sentences

The sentences are broken up into words.

trainWords <- train %>%
  unnest_tokens(word, cleanedtext) %>%
  count(toxic,severe_toxic,obscene,threat,insult,identity_hate,word) %>%
  ungroup()

datatable(head(trainWords,20), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))

Unique Categories of Text

The combinations of toxic,severe toxic,obscene,threat,insult and identity hate will create unique categories. We will display those categories here.

trainWords <- train %>%
  unnest_tokens(word, cleanedtext) %>%
  count(toxic,severe_toxic,obscene,threat,insult,identity_hate,word) %>%
  ungroup()

total_words <- trainWords %>% 
  group_by(toxic,severe_toxic,obscene,threat,insult,identity_hate) %>% 
  summarise(total = sum(n))

total_words

The Math

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
IDF(t) = log_e(Total number of documents / Number of documents with term t in it).
Value = TF * IDF

Twenty Most Important words

Here using TF-IDF , we investigate the Twenty Most Important words

Category =1:41
fillColor = "#8db600"
fillColor2 = "#ffbf00"

total_words$Category = Category

trainWords <- left_join(trainWords, total_words)

#Now we are ready to use the bind_tf_idf which computes the tf-idf for each term. 
trainWords <- trainWords %>%
  bind_tf_idf(word, Category, n)


plot_trainWords <- trainWords %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word))))

plot_trainWords %>% 
  top_n(20) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(fill = fillColor) +
  labs(x = NULL, y = "tf-idf", title = "TDIF for all words") +
  coord_flip() +
  theme_bw()

Toxic TF-IDF

We plot the TF-IDF for the Toxic Comments

plot_trainWords %>%
  filter(toxic == 1 ) %>%
  top_n(20) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(fill = fillColor2) +
  labs(x = NULL, y = "tf-idf", title = "TDIF for Toxic labelled comments") +
  coord_flip() +
  theme_bw()

Severe Toxic TF-IDF

We plot the TF-IDF for the Severe Toxic Comments

plot_trainWords %>%
  filter(severe_toxic == 1 ) %>%
  top_n(20) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(fill = fillColor2) +
  labs(x = NULL, y = "tf-idf", title = "TDIF for Severe Toxic labelled comments") +
  coord_flip() +
  theme_bw()

Obscene TF-IDF

We plot the TF-IDF for the Obscene Comments

plot_trainWords %>%
  filter(obscene == 1 ) %>%
  top_n(20) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(fill = fillColor2) +
  labs(x = NULL, y = "tf-idf", title = "TDIF for obscene labelled comments") +
  coord_flip() +
  theme_bw()

Threat TF-IDF

We plot the TF-IDF for the Threat Comments

plot_trainWords %>%
  filter(threat == 1 ) %>%
  top_n(20) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(fill = fillColor2) +
  labs(x = NULL, y = "tf-idf", title="TDIF for threat labelled comments") +
  coord_flip() 

  theme_bw()
## List of 57
##  $ line                 :List of 6
##   ..$ colour       : chr "black"
##   ..$ size         : num 0.5
##   ..$ linetype     : num 1
##   ..$ lineend      : chr "butt"
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ rect                 :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : chr "black"
##   ..$ size         : num 0.5
##   ..$ linetype     : num 1
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ text                 :List of 11
##   ..$ family       : chr ""
##   ..$ face         : chr "plain"
##   ..$ colour       : chr "black"
##   ..$ size         : num 11
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 0
##   ..$ lineheight   : num 0.9
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 0 0 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 5.5 0 0 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.top     :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 0 5.5 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 5.5 0 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y.right   :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 0 0 5.5
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text            :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey30"
##   ..$ size         :Class 'rel'  num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 2.2 0 0 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.top      :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 0 2.2 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 2.2 0 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y.right    :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 0 0 2.2
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.ticks           :List of 6
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ axis.ticks.length    :Class 'unit'  atomic [1:1] 2.75
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##  $ axis.line            : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.line.x          : NULL
##  $ axis.line.y          : NULL
##  $ legend.background    :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.margin        :Classes 'margin', 'unit'  atomic [1:4] 0.2 0.2 0.2 0.2
##   .. ..- attr(*, "valid.unit")= int 1
##   .. ..- attr(*, "unit")= chr "cm"
##  $ legend.spacing       :Class 'unit'  atomic [1:1] 0.4
##   .. ..- attr(*, "valid.unit")= int 1
##   .. ..- attr(*, "unit")= chr "cm"
##  $ legend.spacing.x     : NULL
##  $ legend.spacing.y     : NULL
##  $ legend.key           :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.key.size      :Class 'unit'  atomic [1:1] 1.2
##   .. ..- attr(*, "valid.unit")= int 3
##   .. ..- attr(*, "unit")= chr "lines"
##  $ legend.key.height    : NULL
##  $ legend.key.width     : NULL
##  $ legend.text          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         :Class 'rel'  num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.text.align    : NULL
##  $ legend.title         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.title.align   : NULL
##  $ legend.position      : chr "right"
##  $ legend.direction     : NULL
##  $ legend.justification : chr "center"
##  $ legend.box           : NULL
##  $ legend.box.margin    :Classes 'margin', 'unit'  atomic [1:4] 0 0 0 0
##   .. ..- attr(*, "valid.unit")= int 1
##   .. ..- attr(*, "unit")= chr "cm"
##  $ legend.box.background: list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.box.spacing   :Class 'unit'  atomic [1:1] 0.4
##   .. ..- attr(*, "valid.unit")= int 1
##   .. ..- attr(*, "unit")= chr "cm"
##  $ panel.background     :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : logi NA
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ panel.border         :List of 5
##   ..$ fill         : logi NA
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ panel.spacing        :Class 'unit'  atomic [1:1] 5.5
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##  $ panel.spacing.x      : NULL
##  $ panel.spacing.y      : NULL
##  $ panel.grid.major     :List of 6
##   ..$ colour       : chr "grey92"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.minor     :List of 6
##   ..$ colour       : chr "grey92"
##   ..$ size         : num 0.25
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.ontop          : logi FALSE
##  $ plot.background      :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : chr "white"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ plot.title           :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         :Class 'rel'  num 1.2
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 0 6.6 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.subtitle        :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         :Class 'rel'  num 0.9
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 0 4.95 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         :Class 'rel'  num 0.9
##   ..$ hjust        : num 1
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 4.95 0 0 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.margin          :Classes 'margin', 'unit'  atomic [1:4] 5.5 5.5 5.5 5.5
##   .. ..- attr(*, "valid.unit")= int 8
##   .. ..- attr(*, "unit")= chr "pt"
##  $ strip.background     :List of 5
##   ..$ fill         : chr "grey85"
##   ..$ colour       : chr "grey20"
##   ..$ size         : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ strip.placement      : chr "inside"
##  $ strip.text           :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey10"
##   ..$ size         :Class 'rel'  num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.x         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 5.5 0 5.5 0
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       :Classes 'margin', 'unit'  atomic [1:4] 0 5.5 0 5.5
##   .. .. ..- attr(*, "valid.unit")= int 8
##   .. .. ..- attr(*, "unit")= chr "pt"
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.switch.pad.grid:Class 'unit'  atomic [1:1] 0.1
##   .. ..- attr(*, "valid.unit")= int 1
##   .. ..- attr(*, "unit")= chr "cm"
##  $ strip.switch.pad.wrap:Class 'unit'  atomic [1:1] 0.1
##   .. ..- attr(*, "valid.unit")= int 1
##   .. ..- attr(*, "unit")= chr "cm"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi TRUE
##  - attr(*, "validate")= logi TRUE

Insult TF-IDF

We plot the TF-IDF for the Insult Comments

plot_trainWords %>%
  filter(insult == 1 ) %>%
  top_n(20) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(fill = fillColor2) +
  labs(x = NULL, y = "tf-idf", title = "TDIF for Insult labelled comments") +
  coord_flip() +
  theme_bw()

Identity Hate TF-IDF

We plot the TF-IDF for the Identity hate Comments

plot_trainWords %>%
  filter(identity_hate == 1 ) %>%
  top_n(20) %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(fill = fillColor2) +
  labs(x = NULL, y = "tf-idf", title = "TDIF for Identity Hate labelled comments") +
  coord_flip() +
  theme_bw()

calculating dtm for test and train data

corpus = VCorpus(VectorSource(train$cleanedtext))
dtm = DocumentTermMatrix(corpus)
dtm = removeSparseTerms(dtm, 0.99)
dataset = as.data.frame(as.matrix(dtm))


corpus = VCorpus(VectorSource(test$cleanedtext))%>%
  tm_map(stemDocument)

dtm = DocumentTermMatrix(corpus)
dtm = removeSparseTerms(dtm, 0.99)
datasetTest = as.data.frame(as.matrix(dtm))

## brining all the common words together from train and text

colnamesSame = intersect(colnames(dataset),colnames(datasetTest))

dataset = dataset[ , (colnames(dataset) %in% colnamesSame)]
datasetTest = datasetTest[ , (colnames(datasetTest) %in% colnamesSame)]

Train and Test split

#partitioning data

# inspect(dtm[20:30, 50:75])
# <<DocumentTermMatrix (documents: 11, terms: 26)>>
# Non-/sparse entries: 1/285
# Sparsity           : 100%
# Maximal term length: 9
# Weighting          : term frequency (tf)
# Sample             :
#     Terms
# Docs book call came care case categori caus certain chang check
  # 20    0    0    0    0    0        0    0       0     0     0
  # 21    0    0    0    0    0        0    0       0     0     0
  # 22    0    0    0    0    0        0    0       0     0     0
  # 23    0    0    0    0    0        0    0       0     0     0
  # 24    0    0    0    0    0        0    1       0     0     0
  # 25    0    0    0    0    0        0    0       0     0     0
  # 26    0    0    0    0    0        0    0       0     0     0
  # 27    0    0    0    0    0        0    0       0     0     0
  # 28    0    0    0    0    0        0    0       0     0     0
  # 29    0    0    0    0    0        0    0       0     0     0

# cross validation of Toxic training data
set.seed(17)
train_index <- sample(1:nrow(dataset), nrow(dataset)*.75)

#labels for train and test
toxic <- train[train_index,]$toxic
part_test_toxic <- train[-train_index,]$toxic

part_train <- cbind(dataset[train_index,],toxic)
part_train$toxic = as.factor(part_train$toxic)
levels(part_train$toxic) = make.names(unique(part_train$toxic))
part_test <- dataset[-train_index,]


formula = toxic ~ .

#fitControl <- trainControl(method="none",classProbs=TRUE, summaryFunction=twoClassSummary)
fitControl <- trainControl(method="cv", number = 3, returnResamp = "all", classProbs=TRUE, summaryFunction=twoClassSummary, allowParallel = T)
#fitControl <- trainControl(method="repeatedcv", number = 10, repeats = 3, classProbs=TRUE, summaryFunction=twoClassSummary)

xgbGrid <- expand.grid(nrounds = 500, 
                       eta = .3,
                       max_depth = 6,
                       gamma = 0,
                       colsample_bytree = .8, 
                       min_child_weight = 1, 
                       subsample = 1) 


set.seed(13)
start_time <-Sys.time()

ToxicXGBt = train(formula, data = part_train,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize = FALSE)

ToxicXGBt
## eXtreme Gradient Boosting 
## 
## 119678 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 79785, 79785, 79786 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7744686  0.9972838  0.1701927
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 5.469419 mins
predToxict = predict(ToxicXGBt,part_test,type ='prob')
predToxict.resp <- ifelse(predToxict$X1 >= 0.80, 1, 0)
confusionMatrix(predToxict.resp,part_test_toxic, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 36022  3232
##          1    17   622
##                                           
##                Accuracy : 0.9186          
##                  95% CI : (0.9158, 0.9212)
##     No Information Rate : 0.9034          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2564          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.16139         
##             Specificity : 0.99953         
##          Pos Pred Value : 0.97340         
##          Neg Pred Value : 0.91766         
##              Prevalence : 0.09661         
##          Detection Rate : 0.01559         
##    Detection Prevalence : 0.01602         
##       Balanced Accuracy : 0.58046         
##                                           
##        'Positive' Class : 1               
## 
#####################################################################################################
# cross validation of Severe Toxic training data
set.seed(17)

#labels for train and test
severe_toxic <- train[train_index,]$severe_toxic
part_test_severe_toxic <- train[-train_index,]$severe_toxic

part_train <- cbind(dataset[train_index,],severe_toxic)
part_train$severe_toxic = as.factor(part_train$severe_toxic)
levels(part_train$severe_toxic) = make.names(unique(part_train$severe_toxic))
part_test <- dataset[-train_index,]

formula = severe_toxic ~ .

set.seed(13)

start_time <-Sys.time()
SToxicXGBt = train(formula, data = part_train,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)

SToxicXGBt
## eXtreme Gradient Boosting 
## 
## 119678 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 79785, 79785, 79786 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8726972  0.9988945  0.1149607
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 5.600067 mins
predSToxict = predict(SToxicXGBt,part_test,type ='prob')
predSToxict.resp <- ifelse(predSToxict$X1 >= 0.80, 1, 0)
confusionMatrix(predSToxict.resp,part_test_severe_toxic, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 39474   404
##          1     7     8
##                                           
##                Accuracy : 0.9897          
##                  95% CI : (0.9887, 0.9907)
##     No Information Rate : 0.9897          
##     P-Value [Acc > NIR] : 0.4933          
##                                           
##                   Kappa : 0.0368          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0194175       
##             Specificity : 0.9998227       
##          Pos Pred Value : 0.5333333       
##          Neg Pred Value : 0.9898691       
##              Prevalence : 0.0103276       
##          Detection Rate : 0.0002005       
##    Detection Prevalence : 0.0003760       
##       Balanced Accuracy : 0.5096201       
##                                           
##        'Positive' Class : 1               
## 
#####################################################################################################
# cross validation of Obscene training data
set.seed(17)

#labels for train and test
obscene <- train[train_index,]$obscene
part_test_obscene <- train[-train_index,]$obscene

part_train <- cbind(dataset[train_index,],obscene)
part_train$obscene = as.factor(part_train$obscene)
levels(part_train$obscene) = make.names(unique(part_train$obscene))
part_test <- dataset[-train_index,]

formula = obscene ~ .

set.seed(13)

start_time <-Sys.time()
ObsceneXGBt = train(formula, data = part_train,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)

ObsceneXGBt
## eXtreme Gradient Boosting 
## 
## 119678 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 79786, 79785, 79785 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8083263  0.9988089  0.2764882
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 5.3436 mins
predObscenet = predict(ObsceneXGBt,part_test,type ='prob')
predObscenet.resp <- ifelse(predObscenet$X1 >= 0.80, 1, 0)
confusionMatrix(predObscenet.resp,part_test_obscene, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 37746  1540
##          1    31   576
##                                           
##                Accuracy : 0.9606          
##                  95% CI : (0.9587, 0.9625)
##     No Information Rate : 0.947           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4091          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.27221         
##             Specificity : 0.99918         
##          Pos Pred Value : 0.94893         
##          Neg Pred Value : 0.96080         
##              Prevalence : 0.05304         
##          Detection Rate : 0.01444         
##    Detection Prevalence : 0.01522         
##       Balanced Accuracy : 0.63570         
##                                           
##        'Positive' Class : 1               
## 
#####################################################################################################
# cross validation of Insult training data
set.seed(17)

#labels for train and test
insult <- train[train_index,]$insult
part_test_insult <- train[-train_index,]$insult

part_train <- cbind(dataset[train_index,],insult)
part_train$insult = as.factor(part_train$insult)
levels(part_train$insult) = make.names(unique(part_train$insult))
part_test <- dataset[-train_index,]

formula = insult ~ .

set.seed(13)

start_time <-Sys.time()
InsultXGBt = train(formula, data = part_train,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)

InsultXGBt
## eXtreme Gradient Boosting 
## 
## 119678 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 79785, 79785, 79786 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7995579  0.9959042  0.2122995
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 5.225757 mins
predinsultt = predict(InsultXGBt,part_test,type ='prob')
predinsultt.resp <- ifelse(predinsultt$X1 >= 0.80, 1, 0)
confusionMatrix(predinsultt.resp,part_test_insult, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 37884  1827
##          1    34   148
##                                           
##                Accuracy : 0.9534          
##                  95% CI : (0.9512, 0.9554)
##     No Information Rate : 0.9505          
##     P-Value [Acc > NIR] : 0.004139        
##                                           
##                   Kappa : 0.13            
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.074937        
##             Specificity : 0.999103        
##          Pos Pred Value : 0.813187        
##          Neg Pred Value : 0.953993        
##              Prevalence : 0.049507        
##          Detection Rate : 0.003710        
##    Detection Prevalence : 0.004562        
##       Balanced Accuracy : 0.537020        
##                                           
##        'Positive' Class : 1               
## 
#####################################################################################################
# cross validation of Identity Hate training data
set.seed(17)

#labels for train and test
identity_hate <- train[train_index,]$identity_hate
part_test_identity_hate <- train[-train_index,]$identity_hate

part_train <- cbind(dataset[train_index,],identity_hate)
part_train$identity_hate = as.factor(part_train$identity_hate)
levels(part_train$identity_hate) = make.names(unique(part_train$identity_hate))
part_test <- dataset[-train_index,]

formula = identity_hate ~ .

set.seed(13)

start_time <-Sys.time()
IHXGBt = train(formula, data = part_train,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)

IHXGBt
## eXtreme Gradient Boosting 
## 
## 119678 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 79785, 79786, 79785 
## Resampling results:
## 
##   ROC        Sens       Spec       
##   0.7780075  0.9997808  0.006729856
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 4.883873 mins
predIHt = predict(IHXGBt,part_test,type ='prob')
predIHt.resp <- ifelse(predIHt$X1 >= 0.80, 1, 0)
confusionMatrix(predIHt.resp,part_test_identity_hate, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 39528   364
##          1     0     1
##                                           
##                Accuracy : 0.9909          
##                  95% CI : (0.9899, 0.9918)
##     No Information Rate : 0.9909          
##     P-Value [Acc > NIR] : 0.4929          
##                                           
##                   Kappa : 0.0054          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 2.740e-03       
##             Specificity : 1.000e+00       
##          Pos Pred Value : 1.000e+00       
##          Neg Pred Value : 9.909e-01       
##              Prevalence : 9.149e-03       
##          Detection Rate : 2.507e-05       
##    Detection Prevalence : 2.507e-05       
##       Balanced Accuracy : 5.014e-01       
##                                           
##        'Positive' Class : 1               
## 
#####################################################################################################

Modelling using XGBoost

Toxic Calculation

We calculate the various targets and predict the probablities

# above to avoid package loading messages 

dataset2 = dataset
dataset2$toxic = train$toxic
dataset2$toxic = as.factor(dataset2$toxic)
levels(dataset2$toxic) = make.names(unique(dataset2$toxic))

formula = toxic ~ .

# (nrounds = 500, #number of iterations
# eta = .05, #learning rate lies between 0.01 - 0.30
# max_depth = 3, #controls the depth of the tree default is 6 but range 0 - Inf
# gamma = 0, #controls regularization to prevent overfitting, default is 0
# lambda = 0, #controls L2 regularization on weight to prevent overfitting
# colsample_bytree = .8, #controls the number of features supplied to a tree lies between 0.5 - 0.9
# min_child_weight = 1, # leaf node has a min sum of instance wgt < min_child_wgt, the tree splitting stops.
# subsample = 1 # controls the number of samples supplied to the tree range 0 - 1 with default at 1

set.seed(13)
start_time <-Sys.time()
ToxicXGB = train(formula, data = dataset2,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)

ToxicXGB
## eXtreme Gradient Boosting 
## 
## 159571 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 106381, 106380, 106381 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7740541  0.9977751  0.1703936
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <-Sys.time()
end_time - start_time
## Time difference of 6.189664 mins
predToxic = predict(ToxicXGB,datasetTest,type ='prob')

plot(predToxic)

#####################################################################################################

Severe Toxic Calculation

We calculate the various targets and predict the probablities

dataset2 = dataset
dataset2$severe_toxic = train$severe_toxic
dataset2$severe_toxic = as.factor(dataset2$severe_toxic)
levels(dataset2$severe_toxic) = make.names(unique(dataset2$severe_toxic))

formula = severe_toxic ~ .

set.seed(13)
start_time <- Sys.time()
SToxicXGB = train(formula, data = dataset2,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
SToxicXGB
## eXtreme Gradient Boosting 
## 
## 159571 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 106381, 106381, 106380 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8693295  0.9989682  0.1097376
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.084725 mins
predSToxic = predict(SToxicXGB,datasetTest,type ='prob')

plot(predSToxic)

#####################################################################################################

Obscene Toxic Calculation

We calculate the various targets and predict the probablities

dataset2 = dataset
dataset2$obscene = train$obscene
dataset2$obscene = as.factor(dataset2$obscene)
levels(dataset2$obscene) = make.names(unique(dataset2$obscene))

formula = obscene ~ .

set.seed(13)

start_time <- Sys.time()
ObsceneXGB = train(formula, data = dataset2,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)

ObsceneXGB
## eXtreme Gradient Boosting 
## 
## 159571 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 106380, 106381, 106381 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8134006  0.9987626  0.2784946
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.055944 mins
predObscene = predict(ObsceneXGB,datasetTest,type ='prob')

plot(predObscene)

#####################################################################################################

Threat Toxic Calculation

We calculate the various targets and predict the probablities

dataset2 = dataset
dataset2$threat = train$threat
dataset2$threat = as.factor(dataset2$threat)
levels(dataset2$threat) = make.names(unique(dataset2$threat))

formula = threat ~ .

set.seed(13)

start_time<-Sys.time()
ThreatXGB = train(formula, data = dataset2,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)

end_time <- Sys.time()
end_time-start_time
## Time difference of 6.088708 mins
predThreat = predict(ThreatXGB,datasetTest,type ='prob')

plot(predThreat)

#####################################################################################################

Insult Toxic Calculation

We calculate the various targets and predict the probablities

dataset2 = dataset
dataset2$insult = train$insult
dataset2$insult = as.factor(dataset2$insult)
levels(dataset2$insult) = make.names(unique(dataset2$insult))

formula = insult ~ .

set.seed(13)

start_time <- Sys.time()
InsultXGB = train(formula, data = dataset2,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
InsultXGB
## eXtreme Gradient Boosting 
## 
## 159571 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 106381, 106381, 106380 
## Resampling results:
## 
##   ROC        Sens      Spec     
##   0.8014859  0.996137  0.2132807
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.054872 mins
predInsult = predict(InsultXGB,datasetTest,type ='prob')

plot(predInsult)

#####################################################################################################

Identity Hate Toxic Calculation

We calculate the various targets and predict the probablities

dataset2 = dataset
dataset2$identity_hate = train$identity_hate
dataset2$identity_hate = as.factor(dataset2$identity_hate)
levels(dataset2$identity_hate) = make.names(unique(dataset2$identity_hate))

formula = identity_hate ~ .

set.seed(13)

start_time <- Sys.time()
IHXGB = train(formula, data = dataset2,
                 method = "xgbTree",trControl = fitControl,
                 tuneGrid = xgbGrid,na.action = na.pass,metric="ROC", maximize=FALSE)
IHXGB
## eXtreme Gradient Boosting 
## 
## 159571 samples
##    195 predictor
##      2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 106380, 106381, 106381 
## Resampling results:
## 
##   ROC        Sens      Spec       
##   0.7838228  0.999804  0.009251666
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
##  0.8
## Tuning parameter 'min_child_weight' was held constant at a value of
##  1
## Tuning parameter 'subsample' was held constant at a value of 1
end_time <- Sys.time()
end_time-start_time
## Time difference of 6.075111 mins
predIH = predict(IHXGB,datasetTest,type ='prob')

plot(predIH)

#####################################################################################################

submission

submission =testdata = fread("sample_submission.csv", header = "auto", sep="auto", nrows=-1L,blank.lines.skip=TRUE, encoding="UTF-8")

submission$toxic = predToxic$X1
submission$severe_toxic = predSToxic$X1
submission$obscene = predObscene$X1
submission$threat = predThreat$X1
submission$insult = predInsult$X1
submission$identity_hate = predIH$X1

# Write it to file
write.csv(submission, 'ToxicCommentsMar312018.csv', row.names = F)